import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.api as sm
import scipy as sp
import statsmodels
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import OneClassSVM
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from scipy.stats import chi2
from sklearn.covariance import MinCovDet
from scipy.spatial import distance
import plotly.express as px
import pandas as pd
import cufflinks as cf
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
# to get the connection
init_notebook_mode(connected = True)
# plotly also serves online,
# but we are using just a sample
cf.go_offline()
mega = pd.read_excel("Mega.xlsx")
mega.head()
| followers_count | deliverable_price_in_dollars | spot_compensation_in_dollars | unit_id | type | |
|---|---|---|---|---|---|
| 0 | 520902 | 5125.0 | 2500 | 52950 | TikTok |
| 1 | 542879 | 2050.0 | 1000 | 50327 | TikTok |
| 2 | 543636 | 27675.0 | 13500 | 53414 | TikTok |
| 3 | 543636 | 27675.0 | 13500 | 52197 | TikTok |
| 4 | 543636 | 27675.0 | 13500 | 52196 | TikTok |
mega.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15 entries, 0 to 14 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 15 non-null int64 1 deliverable_price_in_dollars 15 non-null float64 2 spot_compensation_in_dollars 15 non-null int64 3 unit_id 15 non-null int64 4 type 15 non-null object dtypes: float64(1), int64(3), object(1) memory usage: 728.0+ bytes
x = mega["followers_count"]
y = mega["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=15
x = mega["followers_count"]
y = mega["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=15
mega.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| followers_count | 15.0 | 662100.533333 | 162168.245100 | 520902.0 | 543636.0 | 561262.0 | 732866.5 | 956500.0 |
| deliverable_price_in_dollars | 15.0 | 11112.166667 | 10325.153184 | 2050.0 | 3792.5 | 6500.0 | 18125.0 | 27675.0 |
| spot_compensation_in_dollars | 15.0 | 6176.666667 | 5971.565560 | 1000.0 | 1850.0 | 4000.0 | 9750.0 | 20000.0 |
| unit_id | 15.0 | 49249.000000 | 3907.544571 | 40310.0 | 48138.0 | 50327.0 | 52043.0 | 53414.0 |
sns.pairplot(mega)
<seaborn.axisgrid.PairGrid at 0x1264e7557c0>
#Outlier Treatment
def outlier_detect(df):
for i in df.describe().columns:
Q1=df.describe().at['25%',i]
Q3=df.describe().at['75%',i]
IQR=Q3 - Q1
LTV=Q1 - 1.5 * IQR
UTV=Q3 + 1.5 * IQR
x=np.array(df[i])
p=[]
for j in x:
if j < LTV or j>UTV:
p.append(df[i].median())
else:
p.append(j)
df[i]=p
return df
mega = outlier_detect(mega)
mega.drop("type", axis=1, inplace=True)
mega.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 15 entries, 0 to 14 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 15 non-null int64 1 deliverable_price_in_dollars 15 non-null float64 2 spot_compensation_in_dollars 15 non-null int64 3 unit_id 15 non-null float64 dtypes: float64(2), int64(2) memory usage: 608.0 bytes
fig = px.scatter(mega, x="followers_count", y="deliverable_price_in_dollars",
color="followers_count",
title="Scatter Plot followers_count vs deliverable_price_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
fig = px.scatter(mega, x="followers_count", y="spot_compensation_in_dollars",
hover_data=["unit_id"],
color="followers_count",
title="Scatter Plot followers_count vs spot_compensation_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
sns.set_style("darkgrid")
sns.heatmap(mega.corr(), annot=True)
<AxesSubplot:>
X = mega['followers_count']
y = mega["deliverable_price_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 8699.667 Isolation Forest R2: 0.077 RMSE: 95822692.653
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 8699.667 Minimum Covariance Determinant R2: 0.077 RMSE: 9788.907
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 8662.764 Local Outlier Factor R2: 0.125 RMSE: 9826.529
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 9635.948 One-Class SVM R2: 0.233 RMSE: 11021.574
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 21761.839 Polynomial R2: 0.845 RMSE: 26934.130
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028 Isolation Forest R2: 0.071 RMSE: 49659849.050
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028 Minimum Covariance Determinant R2: 0.071 RMSE: 7046.974
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151 Local Outlier Factor R2: 0.122 RMSE: 7054.052
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853 One-Class SVM R2: 0.233 RMSE: 7171.741
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696 Polynomial R2: 0.848 RMSE: 17154.534
While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives -7 R2 Score which means these variables are not highly correlated in predicting the both price colunn attributes(deliverable_price and spot price). They have no relation for this this
micro = pd.read_excel("micro.xlsx")
micro.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 109 entries, 0 to 108 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 109 non-null int64 1 deliverable_price_in_dollars 109 non-null float64 2 spot_compensation_in_dollars 109 non-null float64 3 unit_id 109 non-null int64 dtypes: float64(2), int64(2) memory usage: 3.5 KB
x = micro["followers_count"]
y = micro["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
x = micro["followers_count"]
y = micro["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
micro.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| followers_count | 109.0 | 25394.633028 | 10844.477611 | 10036.0 | 16061.0 | 24008.0 | 34088.0 | 48892.0 |
| deliverable_price_in_dollars | 109.0 | 5141.442294 | 6171.000726 | 102.5 | 1947.5 | 2562.5 | 5070.0 | 26650.0 |
| spot_compensation_in_dollars | 109.0 | 2521.110826 | 3013.192166 | 50.0 | 950.0 | 1250.0 | 2500.0 | 13000.0 |
| unit_id | 109.0 | 49666.779817 | 3405.725817 | 39878.0 | 47757.0 | 50909.0 | 52073.0 | 54998.0 |
sns.pairplot(micro)
<seaborn.axisgrid.PairGrid at 0x126535bde20>
#Outlier Treatment
def outlier_detect(df):
for i in df.describe().columns:
Q1=df.describe().at['25%',i]
Q3=df.describe().at['75%',i]
IQR=Q3 - Q1
LTV=Q1 - 1.5 * IQR
UTV=Q3 + 1.5 * IQR
x=np.array(df[i])
p=[]
for j in x:
if j < LTV or j>UTV:
p.append(df[i].median())
else:
p.append(j)
df[i]=p
return df
micro = outlier_detect(micro)
#micro.drop("type", axis=1, inplace=True)
micro.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 109 entries, 0 to 108 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 109 non-null int64 1 deliverable_price_in_dollars 109 non-null float64 2 spot_compensation_in_dollars 109 non-null float64 3 unit_id 109 non-null float64 dtypes: float64(3), int64(1) memory usage: 3.5 KB
fig = px.scatter(micro, x="followers_count", y="deliverable_price_in_dollars",
color="followers_count",
title="Scatter Plot followers_count vs deliverable_price_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
fig = px.scatter(micro, x="followers_count", y="spot_compensation_in_dollars",
hover_data=["unit_id"],
color="followers_count",
title="Scatter Plot followers_count vs spot_compensation_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
sns.set_style("darkgrid")
sns.heatmap(micro.corr(), annot=True)
<AxesSubplot:>
X = micro['followers_count']
y = micro["deliverable_price_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(73, 1) (73,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 1330.270 Isolation Forest R2: 0.012 RMSE: 2895660.631
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 1375.852 Minimum Covariance Determinant R2: 0.046 RMSE: 1767.391
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 1373.723 Local Outlier Factor R2: 0.041 RMSE: 1763.470
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 1380.030 One-Class SVM R2: 0.071 RMSE: 1777.913
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
poly = PolynomialFeatures(degree=4)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 1494.071 Polynomial R2: 0.214 RMSE: 2016.141
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028 Isolation Forest R2: 0.071 RMSE: 49659849.050
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028 Minimum Covariance Determinant R2: 0.071 RMSE: 7046.974
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151 Local Outlier Factor R2: 0.122 RMSE: 7054.052
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853 One-Class SVM R2: 0.233 RMSE: 7171.741
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696 Polynomial R2: 0.848 RMSE: 17154.534
mid = pd.read_excel("mid.xlsx")
mid.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 88 entries, 0 to 87 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 88 non-null int64 1 deliverable_price_in_dollars 88 non-null float64 2 spot_compensation_in_dollars 88 non-null float64 3 unit_id 88 non-null int64 dtypes: float64(2), int64(2) memory usage: 2.9 KB
x = mid["followers_count"]
y = mid["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
x = mid["followers_count"]
y = mid["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
mid.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| followers_count | 88.0 | 109674.488636 | 43514.781055 | 50660.0 | 66922.000 | 101134.5 | 138673.0 | 198560.0 |
| deliverable_price_in_dollars | 88.0 | 6128.681932 | 5427.416630 | 410.0 | 2946.875 | 4715.0 | 8200.0 | 41000.0 |
| spot_compensation_in_dollars | 88.0 | 3012.882500 | 2647.292315 | 200.0 | 1437.500 | 2625.0 | 4000.0 | 20000.0 |
| unit_id | 88.0 | 49246.329545 | 3459.655845 | 39967.0 | 47266.000 | 49445.0 | 51733.5 | 55045.0 |
sns.pairplot(mid)
<seaborn.axisgrid.PairGrid at 0x126514f6d90>
#Outlier Treatment
def outlier_detect(df):
for i in df.describe().columns:
Q1=df.describe().at['25%',i]
Q3=df.describe().at['75%',i]
IQR=Q3 - Q1
LTV=Q1 - 1.5 * IQR
UTV=Q3 + 1.5 * IQR
x=np.array(df[i])
p=[]
for j in x:
if j < LTV or j>UTV:
p.append(df[i].median())
else:
p.append(j)
df[i]=p
return df
mid = outlier_detect(mid)
#mid.drop("type", axis=1, inplace=True)
fig = px.scatter(mid, x="followers_count", y="deliverable_price_in_dollars",
color="followers_count",
title="Scatter Plot followers_count vs deliverable_price_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
fig = px.scatter(mid, x="followers_count", y="spot_compensation_in_dollars",
hover_data=["unit_id"],
color="followers_count",
title="Scatter Plot followers_count vs spot_compensation_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
sns.set_style("darkgrid")
sns.heatmap(mid.corr(), annot=True)
<AxesSubplot:>
X = mid['followers_count']
y = mid["deliverable_price_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(58, 1) (58,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 2312.041 Isolation Forest R2: 0.002 RMSE: 7308077.099
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 2285.355 Minimum Covariance Determinant R2: 0.009 RMSE: 2692.046
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 2293.635 Local Outlier Factor R2: 0.006 RMSE: 2693.350
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 2310.145 One-Class SVM R2: 0.004 RMSE: 2707.950
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
poly = PolynomialFeatures(degree=3)
X_train3_poly = poly.fit_transform(X_train3)
X_train3_poly
model = LinearRegression()
train_y_ = model.fit(X_train3_poly, y_train3)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train3_poly, y_train3))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 2487.203 Polynomial R2: 0.085 RMSE: 2936.645
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028 Isolation Forest R2: 0.071 RMSE: 49659849.050
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028 Minimum Covariance Determinant R2: 0.071 RMSE: 7046.974
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151 Local Outlier Factor R2: 0.122 RMSE: 7054.052
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853 One-Class SVM R2: 0.233 RMSE: 7171.741
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696 Polynomial R2: 0.848 RMSE: 17154.534
macro = pd.read_excel("macro.xlsx")
macro.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 50 entries, 0 to 49 Data columns (total 4 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 50 non-null int64 1 deliverable_price_in_dollars 50 non-null float64 2 spot_compensation_in_dollars 50 non-null int64 3 unit_id 50 non-null int64 dtypes: float64(1), int64(3) memory usage: 1.7 KB
x = macro["followers_count"]
y = macro["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
x = macro["followers_count"]
y = macro["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
macro.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| followers_count | 50.0 | 309281.920 | 95771.230089 | 205129.0 | 233578.25 | 276740.0 | 382384.00 | 495068.0 |
| deliverable_price_in_dollars | 50.0 | 7036.935 | 4601.449859 | 615.0 | 3536.25 | 6150.0 | 9225.00 | 20500.0 |
| spot_compensation_in_dollars | 50.0 | 3860.700 | 2969.729942 | 300.0 | 2000.00 | 3000.0 | 4500.00 | 15000.0 |
| unit_id | 50.0 | 48575.600 | 3787.040803 | 39881.0 | 45904.25 | 49804.0 | 51545.25 | 53236.0 |
sns.pairplot(macro)
<seaborn.axisgrid.PairGrid at 0x12654764400>
#Outlier Treatment
def outlier_detect(df):
for i in df.describe().columns:
Q1=df.describe().at['25%',i]
Q3=df.describe().at['75%',i]
IQR=Q3 - Q1
LTV=Q1 - 1.5 * IQR
UTV=Q3 + 1.5 * IQR
x=np.array(df[i])
p=[]
for j in x:
if j < LTV or j>UTV:
p.append(df[i].median())
else:
p.append(j)
df[i]=p
return df
macro = outlier_detect(macro)
#macro.drop("type", axis=1, inplace=True)
fig = px.scatter(macro, x="followers_count", y="deliverable_price_in_dollars",
color="followers_count",
title="Scatter Plot followers_count vs deliverable_price_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
fig = px.scatter(macro, x="followers_count", y="spot_compensation_in_dollars",
hover_data=["unit_id"],
color="followers_count",
title="Scatter Plot followers_count vs spot_compensation_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
sns.set_style("darkgrid")
sns.heatmap(macro.corr(), annot=True)
<AxesSubplot:>
X = macro['followers_count']
y = macro["deliverable_price_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(33, 1) (33,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 3065.037 Isolation Forest R2: 0.046 RMSE: 15518326.744
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 3057.157 Minimum Covariance Determinant R2: 0.035 RMSE: 3932.082
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 3101.212 Local Outlier Factor R2: 0.097 RMSE: 3962.171
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 3088.410 One-Class SVM R2: 0.063 RMSE: 3960.937
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
poly = PolynomialFeatures(degree=10)
X_train3_poly = poly.fit_transform(X_train3)
X_train3_poly
model = LinearRegression()
train_y_ = model.fit(X_train3_poly, y_train3)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train3_poly, y_train3))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 2967.230 Polynomial R2: 0.164 RMSE: 3790.096
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028 Isolation Forest R2: 0.071 RMSE: 49659849.050
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028 Minimum Covariance Determinant R2: 0.071 RMSE: 7046.974
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151 Local Outlier Factor R2: 0.122 RMSE: 7054.052
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853 One-Class SVM R2: 0.233 RMSE: 7171.741
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696 Polynomial R2: 0.848 RMSE: 17154.534
While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives 80% score which means these variables are highly correlated in predicting the both price colunn attributes(deliverable_price and spot price)
nano = pd.read_excel("nano.xlsx")
nano.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 59 entries, 0 to 58 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 59 non-null int64 1 deliverable_price_in_dollars 59 non-null float64 2 spot_compensation_in_dollars 59 non-null int64 3 unit_id 59 non-null int64 4 type 59 non-null object dtypes: float64(1), int64(3), object(1) memory usage: 2.4+ KB
x = nano["followers_count"]
y = nano["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
x = nano["followers_count"]
y = nano["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
nano.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| followers_count | 59.0 | 3646.000000 | 2699.682178 | 11.0 | 1507.00 | 2808.0 | 5790.5 | 9362.0 |
| deliverable_price_in_dollars | 59.0 | 4049.203390 | 5240.300565 | 410.0 | 1383.75 | 2050.0 | 4100.0 | 21525.0 |
| spot_compensation_in_dollars | 59.0 | 2062.033898 | 2737.102375 | 200.0 | 675.00 | 1000.0 | 2000.0 | 10500.0 |
| unit_id | 59.0 | 49536.745763 | 3827.341157 | 40501.0 | 48059.00 | 51096.0 | 52418.5 | 53699.0 |
sns.pairplot(nano)
<seaborn.axisgrid.PairGrid at 0x12653278d90>
#Outlier Treatment
def outlier_detect(df):
for i in df.describe().columns:
Q1=df.describe().at['25%',i]
Q3=df.describe().at['75%',i]
IQR=Q3 - Q1
LTV=Q1 - 1.5 * IQR
UTV=Q3 + 1.5 * IQR
x=np.array(df[i])
p=[]
for j in x:
if j < LTV or j>UTV:
p.append(df[i].median())
else:
p.append(j)
df[i]=p
return df
nano = outlier_detect(nano)
nano.drop("type", axis=1, inplace=True)
fig = px.scatter(nano, x="followers_count", y="deliverable_price_in_dollars",
color="followers_count",
title="Scatter Plot followers_count vs deliverable_price_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
fig = px.scatter(nano, x="followers_count", y="spot_compensation_in_dollars",
hover_data=["unit_id"],
color="followers_count",
title="Scatter Plot followers_count vs spot_compensation_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
sns.set_style("darkgrid")
sns.heatmap(nano.corr(), annot=True)
<AxesSubplot:>
X = nano['followers_count']
y = nano["deliverable_price_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(39, 1) (39,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 1219.163 Isolation Forest R2: 0.003 RMSE: 2191916.671
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 1193.798 Minimum Covariance Determinant R2: 0.042 RMSE: 1440.532
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 1197.429 Local Outlier Factor R2: 0.059 RMSE: 1440.845
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 1192.820 One-Class SVM R2: 0.043 RMSE: 1438.110
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
poly = PolynomialFeatures(degree=4)
X_train3_poly = poly.fit_transform(X_train3)
X_train3_poly
model = LinearRegression()
train_y_ = model.fit(X_train3_poly, y_train3)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train3_poly, y_train3))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 1199.604 Polynomial R2: 0.173 RMSE: 1480.782
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028 Isolation Forest R2: 0.071 RMSE: 49659849.050
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028 Minimum Covariance Determinant R2: 0.071 RMSE: 7046.974
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151 Local Outlier Factor R2: 0.122 RMSE: 7054.052
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853 One-Class SVM R2: 0.233 RMSE: 7171.741
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696 Polynomial R2: 0.848 RMSE: 17154.534
While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives 100% score which means these variables are highly correlated in predicting the both price colunn attributes(deliverable_price and spot price)
web = pd.read_excel("web.xlsx")
web.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 11 entries, 0 to 10 Data columns (total 5 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 followers_count 11 non-null int64 1 deliverable_price_in_dollars 11 non-null float64 2 spot_compensation_in_dollars 11 non-null float64 3 unit_id 11 non-null int64 4 type 11 non-null object dtypes: float64(2), int64(2), object(1) memory usage: 568.0+ bytes
x = web["followers_count"]
y = web["spot_compensation_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11
x = web["followers_count"]
y = web["deliverable_price_in_dollars"]
model = sm.OLS(y, x).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 10}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
plt.savefig('output.png')
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11
web.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| followers_count | 11.0 | 1.826295e+06 | 1.235187e+06 | 1029606.0 | 1122045.00 | 1469865.0 | 1758976.5 | 5349027.0 |
| deliverable_price_in_dollars | 11.0 | 1.198093e+04 | 8.923663e+03 | 0.0 | 5555.09 | 10762.5 | 19400.0 | 24600.0 |
| spot_compensation_in_dollars | 11.0 | 6.592691e+03 | 4.902140e+03 | 0.0 | 2709.80 | 5250.0 | 11500.0 | 12500.0 |
| unit_id | 11.0 | 4.712045e+04 | 3.894937e+03 | 41045.0 | 43959.00 | 47911.0 | 50161.5 | 52926.0 |
sns.pairplot(web)
<seaborn.axisgrid.PairGrid at 0x126567f26a0>
#Outlier Treatment
def outlier_detect(df):
for i in df.describe().columns:
Q1=df.describe().at['25%',i]
Q3=df.describe().at['75%',i]
IQR=Q3 - Q1
LTV=Q1 - 1.5 * IQR
UTV=Q3 + 1.5 * IQR
x=np.array(df[i])
p=[]
for j in x:
if j < LTV or j>UTV:
p.append(df[i].median())
else:
p.append(j)
df[i]=p
return df
web = outlier_detect(web)
web.drop("type", axis=1, inplace=True)
fig = px.scatter(web, x="followers_count", y="deliverable_price_in_dollars",
color="followers_count",
title="Scatter Plot followers_count vs deliverable_price_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
fig = px.scatter(web, x="followers_count", y="spot_compensation_in_dollars",
hover_data=["unit_id"],
color="followers_count",
title="Scatter Plot followers_count vs spot_compensation_in_dollars",
)
fig.update_traces(marker=dict(size=12,
line=dict(width=2,
color='DarkSlateGrey')),
selector=dict(mode='markers'))
fig.show()
sns.set_style("darkgrid")
sns.heatmap(web.corr(), annot=True)
<AxesSubplot:>
X = web['followers_count']
y = web["deliverable_price_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(7, 1) (7,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 7047.335 Isolation Forest R2: 0.104 RMSE: 94301228.701
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 7047.335 Minimum Covariance Determinant R2: 0.104 RMSE: 9710.882
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 6184.800 Local Outlier Factor R2: 0.005 RMSE: 8075.137
C:\Users\DELL\anaconda3\lib\site-packages\sklearn\neighbors\_lof.py:274: UserWarning: n_neighbors (10) is greater than the total number of samples (7). n_neighbors will be set to (n_samples - 1) for estimation.
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 11543.009 One-Class SVM R2: 0.420 RMSE: 16265.347
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('deliverable_price_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 5 samples were given.
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 107200.131 Polynomial R2: 0.947 RMSE: 189914.795
X = mega['followers_count']
y = mega["spot_compensation_in_dollars"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=0)
# In order to feed x to sklearn, it should be a 2D array (a matrix)
# Therefore, we must reshape X_train and X_test
# Note that this will not be needed when we've got more than 1 feature (as the inputs will be a 2D array by default)
X_train = X_train.values.reshape(-1, 1)
X_test = X_test.values.reshape(-1, 1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
(10, 1) (10,)
# identify outliers in the training dataset
iso = IsolationForest(contamination=0.1)
yhat = iso.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train1, y_train1 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train1, y_train1)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat)
print('Isolation Forest MAE: %.3f' % mae)
print('Isolation Forest R2: %.3f' % reg.score(X_train1, y_train1))
print('RMSE: %.3f' % rmse)
Isolation Forest MAE: 5419.028 Isolation Forest R2: 0.071 RMSE: 49659849.050
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train1,y_train1)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Isolation Forest', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train1, y_train1).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train2, y_train2 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train2, y_train2)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Minimum Covariance Determinant MAE: %.3f' % mae)
print('Minimum Covariance Determinant R2: %.3f' % reg.score(X_train2, y_train2))
print('RMSE: %.3f' % rmse)
Minimum Covariance Determinant MAE: 5419.028 Minimum Covariance Determinant R2: 0.071 RMSE: 7046.974
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train2,y_train2)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Minimum Covariance Determinant', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train2, y_train2).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
lof = LocalOutlierFactor(n_neighbors=10, contamination=0.1)
yhat = lof.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train3, y_train3 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train3, y_train3)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Local Outlier Factor MAE: %.3f' % mae)
print('Local Outlier Factor R2: %.3f' % reg.score(X_train3, y_train3))
print('RMSE: %.3f' % rmse)
Local Outlier Factor MAE: 5381.151 Local Outlier Factor R2: 0.122 RMSE: 7054.052
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train3,y_train3)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('Local Outlier Factor', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train3, y_train3).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=9
# identify outliers in the training dataset
ee = OneClassSVM(nu=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train4, y_train4 = X_train[mask, :], y_train[mask]
# fit the model
reg = LinearRegression()
reg.fit(X_train4, y_train4)
# evaluate the model
yhat = reg.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('One-Class SVM MAE: %.3f' % mae)
print('One-Class SVM R2: %.3f' % reg.score(X_train4, y_train4))
print('RMSE: %.3f' % rmse)
One-Class SVM MAE: 5724.853 One-Class SVM R2: 0.233 RMSE: 7171.741
# There are different ways to plot the data - here's the matplotlib code
plt.scatter(X_train4,y_train4)
# Parametrized version of the regression line
yhat = reg.coef_*X_test + reg.intercept_
fig = plt.plot(X_test,yhat,color='red', linestyle='dashdot',linewidth=2)
# Labelling our axes
plt.title('One-Class SVM', fontsize = 20)
plt.xlabel('followers_count', fontsize = 20)
plt.ylabel('spot_compensation_in_dollars', fontsize = 20)
plt.show()
model = sm.OLS(X_train4, y_train4).fit()
predictions = model.predict(x)
plt.rc('figure', figsize=(12, 7))
plt.text(0.01, 0.05, str(model.summary()), {'fontsize': 12}, fontproperties = 'monospace')
plt.axis('off')
plt.tight_layout()
C:\Users\DELL\anaconda3\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=8
poly = PolynomialFeatures(degree=3)
X_train4_poly = poly.fit_transform(X_train4)
X_train4_poly
model = LinearRegression()
train_y_ = model.fit(X_train4_poly, y_train4)
X_test_poly = poly.fit_transform(X_test)
yhat = model.predict(X_test_poly)
mae = mean_absolute_error(y_test, yhat)
rmse = mean_squared_error(y_test, yhat, squared=False)
print('Polynomial MAE: %.3f' % mae)
print('Polynomial R2: %.3f' % model.score(X_train4_poly, y_train4))
print('RMSE: %.3f' % rmse)
Polynomial MAE: 13459.696 Polynomial R2: 0.848 RMSE: 17154.534
While predicting spot_compensation_in_dollars and deliverable_price_in_dollars with respect to followers_count and unit_id etc. It gives 77% score which means these variables are highly correlated in predicting the both price colunn attributes(deliverable_price and spot price)